# use this line for installing/loading# pacman::p_load()# - packages to load stored in a variable (vector)pkgs <-c("tidyverse","glue","scales","lubridate","patchwork","ggh4x","ggrepel","openintro","ggridges","dsbox","janitor","here","knitr","ggthemes","ggplot2","kableExtra","palmerpenguins","grid","htmltools","plotly","ggforce","cowplot","magick","forcats","stringr")# - load from the character array/vectorpacman::p_load(char=pkgs)# - install tidyverse/dsbox directly from Git Hub# - this allows for the possible need to install on a repo. pull.# - and, if it's already installed just thorw an alert.if (!requireNamespace("dsbox", quietly =TRUE)) {message("Installing 'dsbox' from GitHub (not found locally)...")suppressMessages(devtools::install_github("tidyverse/dsbox"))} else {message("[FYI]\n'dsbox' already installed — skipping GitHub install.")}
# - alert to user packages loaded.# Set number of columns (adjustable)n_cols <-4# Add * to each package namepkgs <-paste0("* ", pkgs)# Calculate number of rows based on total packagesn_rows <-ceiling(length(pkgs) / n_cols)# Pad with empty strings to complete gridpkgs_padded <-c(pkgs, rep("", n_rows * n_cols -length(pkgs)))# Create matrix (fill by row)pkg_matrix <-matrix(pkgs_padded, nrow = n_rows, byrow =TRUE)# Print headercat("The packages loaded:")
The packages loaded:
Code
# Loop and print each row (use invisible to suppress NULL)invisible(apply(pkg_matrix, 1, function(row) {cat(paste(format(row, width =22), collapse =""), "\n")}))
#-------------------------->####################### Basic set Theme up ######################## ---- set theme for ggplot2ggplot2::theme_set(ggplot2::theme_minimal(base_size =14))# set width of code outputoptions(width =65)# set figure parameters for knitrknitr::opts_chunk$set(fig.width =7, # 7" widthfig.asp =0.618, # the golden ratiofig.retina =3, # dpi multiplier for displaying HTML output on retinafig.align ="center", # center align figuresdpi =300# higher dpi, sharper image)## ---- end theme set up
(1) - function block
Q5 ….
Code
# ............ A function block, to handle Q3,Q4 with minimal code duplication# - size as a variableset_dot_size <-1# Function for the "All" group plot (g0)plot_all <-function(data) {ggplot(data, aes(x = explanatory_value, y = mean)) +geom_errorbar(aes(ymin = low, ymax = high), width =0.2) +geom_point(size = set_dot_size, color ="black") +coord_flip() +facet_grid(rows =vars(explanatory),cols =vars(response),labeller =labeller(response =as_labeller(response_labels),explanatory =as_labeller(explanatory_labels) ) ) +theme_minimal(base_size =11) +labs(title ="COVID-19 Vaccine Attitudes by Demographic Group",x =NULL,y =NULL ) +theme(plot.title =element_text(hjust =0.5),strip.background =element_rect(fill = strip_fill_color, color ="black"),strip.placement = strip_placement,strip.text.x =element_text(vjust =0.5,size = strip_text_size,margin =margin(t =20, b =10, r =5, l =5) ),strip.text.y.right =element_text(angle =0,vjust =0.5,margin =margin(t =10, b =10, r =15, l =15) ),axis.text.y =element_blank(),axis.text.x =element_blank(),axis.ticks.x =element_blank() )}# Function for the Age plot (g1)plot_age <-function(data) {ggplot(data, aes(x = explanatory_value, y = mean, group = explanatory_value)) +geom_errorbar(aes(ymin = low, ymax = high), width =0.2) +geom_point(size = set_dot_size, color ="black") +coord_flip() +facet_grid(rows =vars(explanatory),cols =vars(response),labeller =labeller(explanatory =as_labeller(explanatory_labels) ) ) +theme_minimal(base_size =12) +labs(x =NULL,y =NULL ) +theme(strip.background =element_rect(fill = strip_fill_color, color ="black"),strip.placement = strip_placement,strip.text.x =element_blank(),strip.text.y.right =element_text(angle =0,vjust =0.5,margin =margin(t =10, b =10, r =13, l =13) ),axis.text.y =element_text(size =10),panel.spacing =unit(1, "lines"),axis.text.x =element_blank(),axis.ticks.x =element_blank() )}# Function for the Gender plot (g2)plot_gender <-function(data) {ggplot(data, aes(x = explanatory_value, y = mean, group = explanatory_value)) +geom_errorbar(aes(ymin = low, ymax = high), width =0.2) +geom_point(size = set_dot_size, color ="black") +coord_flip() +facet_grid(rows =vars(explanatory),cols =vars(response),labeller =labeller(explanatory =as_labeller(explanatory_labels) ) ) +theme_minimal(base_size =12) +labs(x =NULL,y =NULL ) +theme(strip.background =element_rect(fill = strip_fill_color, color ="black"),strip.placement = strip_placement,strip.text.x =element_blank(),strip.text.y.right =element_text(angle =0,vjust =0.5,margin =margin(t =10, b =10, r =5, l =6) ),axis.text.y =element_text(size =10),axis.text.x =element_blank(),panel.spacing =unit(1, "lines"),axis.ticks.x =element_blank() )}# Function for the Race plot (g3)plot_race <-function(data) {ggplot(data, aes(x = explanatory_value, y = mean, group = explanatory_value)) +geom_errorbar(aes(ymin = low, ymax = high), width =0.2) +geom_point(size = set_dot_size, color ="black") +coord_flip() +facet_grid(rows =vars(explanatory),cols =vars(response),labeller =labeller(explanatory =as_labeller(explanatory_labels) ) ) +theme_minimal(base_size =12) +labs(x =NULL,y =NULL ) +theme(strip.background =element_rect(fill = strip_fill_color, color ="black"),strip.placement = strip_placement,strip.text.x =element_blank(),strip.text.y.right =element_text(angle =0,vjust =0.5,margin =margin(t =10, b =10, r =10, l =10) ),axis.text.y =element_text(size =10),panel.spacing =unit(1, "lines"),axis.text.x =element_blank(),axis.ticks.x =element_blank() )}# Function for the Ethnicity plot (g4)plot_ethnicity <-function(data, sub_title_specific) {ggplot(data, aes(x = explanatory_value, y = mean, group = explanatory_value)) +geom_errorbar(aes(ymin = low, ymax = high), width =0.2) +geom_point(size = set_dot_size, color ="black") +coord_flip() +facet_grid(rows =vars(explanatory),cols =vars(response),labeller =labeller(explanatory =as_labeller(explanatory_labels) ) ) +theme_minimal(base_size =10) +labs(x =NULL,y =paste0("Mean Likert score\n(Error bars: ", sub_title_specific, ")") ) +theme(strip.background =element_rect(fill = strip_fill_color, color ="black"),strip.placement = strip_placement,strip.text.x =element_blank(),strip.text.y.right =element_text(angle =0,vjust =0.5,margin =margin(t =10, b =10, r =4, l =7) ),axis.text.y =element_text(size =10),axis.text.x =element_text(size =10),axis.ticks.x =element_line(),panel.spacing =unit(1, "lines") )}# ..... prepare the variables.# . ethnicity.filter_ethnicity_data <-function(data) { data %>%filter(explanatory =="exp_ethnicity") %>%filter(is.finite(mean), is.finite(low), is.finite(high)) %>%mutate(explanatory_value =recode(as.character(explanatory_value),"1"="Hispanic/Latino","2"="Non-Hispanic/Non-Latino"),explanatory_value =factor(explanatory_value, levels =c("Hispanic/Latino", "Non-Hispanic/Non-Latino" )),explanatory =factor(explanatory, levels =c("All", "exp_age_bin", "exp_gender", "exp_race", "exp_ethnicity" )) )}# . agefilter_age_data <-function(data) { data %>%filter(explanatory =="exp_age_bin") %>%filter(is.finite(mean), is.finite(low), is.finite(high)) %>%mutate(explanatory_value =recode(as.character(explanatory_value),"0"="<20","20"="21-25","25"="26-30","30"=">30" ),explanatory_value =factor(explanatory_value, levels =c("<20", "21-25", "26-30", ">30")),explanatory =factor(explanatory, levels =c("All", "exp_age_bin", "exp_gender", "exp_race")) )}# . genderfilter_gender_data <-function(data) { data %>%filter(explanatory =="exp_gender") %>%filter(is.finite(mean), is.finite(low), is.finite(high)) %>%mutate(explanatory_value =as.character(explanatory_value),explanatory_value =fct_recode(factor(explanatory_value),"Prefer not to say"="4","Non-binary third gender"="3","Male"="0","Female"="1" ),explanatory_value =factor(explanatory_value, levels =rev(c("Prefer not to say","Non-binary third gender","Male","Female" ))),explanatory =factor(explanatory, levels =c("All", "exp_age_bin", "exp_gender", "exp_race")) )}# . racefilter_race_data <-function(data) { data %>%filter(explanatory =="exp_race") %>%filter(is.finite(mean), is.finite(low), is.finite(high)) %>%mutate(explanatory_value =recode(as.character(explanatory_value),"1"="American Indian/Alaska Native","2"="Asian","3"="Black/African American","4"="Native Hawaiian/Other Pacific Islander","5"="White" ),explanatory_value =factor(explanatory_value, levels =rev(c("White","Native Hawaiian/Other Pacific Islander","Black/African American","Asian","American Indian/Alaska Native" ))),explanatory =factor(explanatory, levels =c("All", "exp_age_bin", "exp_gender", "exp_race")) )}
1 - Du Bois challenge.
Du Bois challenge. Recreate the following visualization by W.E.B. Du Bois on family budgets split by income classes for 150 families in Atlanta, Georgia. This visualization was originally created using ink and watercolors.
Note: Since there appears to be some allowable creativity with the features reperesented. I left a scale on the bottom of the parchment, and left off the ‘connecting lines’ connecting the same colored segments together for the stacked bar charts. It ended up being a lot of code - to separately construct and place all pieces of the chart together. First effort. There may be a more efficient way to re-make the plot? . I rendered the output image as html - and I cannot git rid of the small ‘png 2’ label (atm).
png 2
2 - COVID survey - interpretation
Q2 - Interpret what’s occurring in the survey, and discuss any results that go against your intuition. In a chart this large, “interpret” (as opposed to simply describing) really means identifying trends in the data. 1. Trust and Profession: Medical students showed noticeably more variance in agreement (i.e. a broader distribution of Likert scores) than nursing students with the statement “I trust the information that I have received about the vaccines.” This could be interpreted as a difference in behavior/diagnostic techniques between the two professions. 2. Concern and Age: Of note was the general ambivalence towards ‘safety and side effects’, as represented by the average score of ‘3’ across all age groups. While the distribution was wide for all age groups, the general consensus among those in the medical profession was no concern for the safety involved with the vaccine. 3. Vaccination History and Vaccine Perception: Across the board - high ‘strongly agree’ - to the category of ‘I will recommend the vaccine to family, friends, and community members.’ This would show a very strong trend of trusting the science in the medical community.
4. Counter intuitive/interesting One interesting trend was that responses by gender identity showed subtle but noteworthy divergence. For some response variables—especially those touching on vaccine efficacy or side effects—nonbinary respondents’ scores tended to fall at the extremes, reflecting either more trust or more concern.
Another pattern emerged in the age dimension, with younger respondents appearing to show greater trust in vaccines.
Overall, it’s striking that the medical and nursing student community was not in lockstep regarding their interpretations of the science and its safety.
Code
#------- no code necessary ..
3 - COVID survey - reconstruct
Q3 ….
Data Analysis - Q1
📄 The original data frame (raw_preview) has:
- 1123 rows
- 14 columns
✅ Rows with only `response_id` and all other fields missing have been removed.
Original dataset rows: 1121
Rows removed: 10
Cleaned dataset size: 1111 rows × 14 columns
**Rows_Removed**
row:3
row:152
row:153
row:414
row:529
row:556
row:577
row:835
row:987
row:1050
Code
# - Step 1a: print the dim of the original df.original_dim <-dim(raw_preview)cat(glue("📄 The original data frame (`raw_preview`) has:\n","- {original_dim[1]} rows\n","- {original_dim[2]} columns\n\n","⚠️ Rows with no available data (i.e., only `response_id` present)\n will be removed in preprocessing.\n","\n✅ **New Dimensions** of `survey_clean` after cleaning:\n","📊 Rows: {nrow(survey_clean)}\n","📐 Columns: {ncol(survey_clean)}\n"))
📄 The original data frame (`raw_preview`) has:
- 1123 rows
- 14 columns
⚠️ Rows with no available data (i.e., only `response_id` present)
will be removed in preprocessing.
✅ **New Dimensions** of `survey_clean` after cleaning:
📊 Rows: 1111
📐 Columns: 14
Code
#-- ... --- based on info in pdf file and .csv .. encode the following# exp_profession........... # exp_flu_vax.............. # exp_gender............... Q2 What is your gender? # exp_race................. Q3 What is your race? # exp_ethnicity............ Q4 What is your ethnicity? # exp_age_bin.............. Q1 What is your age? # exp_already_vax.......... # resp_safety.............. Q26 Based on my understanding, I believe the vaccine is safe. # resp_confidence_science.. Q34 I am confident in the scientific vetting process for the new COVID vaccines. # resp_concern_safety...... Q27 I am concerned about the safety and side effects of the vaccine. # resp_feel_safe_at_work... Q28 Getting the vaccine will make me feel safer at work. # resp_will_recommend...... Q29 I will recommend the vaccine to family, friends, and community members. # resp_trust_info.......... Q31 I trust the information that I have received about the COVID-19 vaccines.covid_survey_longer <- survey_clean |>pivot_longer(cols =starts_with("exp_"),names_to ="explanatory",values_to ="explanatory_value" ) |>mutate(explanatory_value =as.factor(explanatory_value)) |>filter(!is.na(explanatory_value)) |>pivot_longer(cols =starts_with("resp_"),names_to ="response",values_to ="response_value" )print(covid_survey_longer)
first pivot_longer(): Converts all columns that start with “exp_” (e.g., exp_profession, exp_gender, etc.) from wide format into long format. Creates two new columns: explanatory: holds the original column names (like “exp_profession”) explanatory_value: holds the actual values from those columns (like “Nursing” or “1”) second pivot_longer(): After already pivoting the explanatory variables, this takes the remaining response variables (resp_safety, resp_confidence_science, etc.) and pivots them long as well. Creates two new columns: response: original column name response_value: corresponding value
create the df/tibble: covid_survey_summary_stats_by_group
Code
# - group the data - by explanatory, explanatory_value, and response calc.# - the following stats:# - mean of the response_value# - low 10th percentile of the response_value# - high 90th percentile of the response_value# - rename the df coivd_survey_summart_stats_by_groupcovid_survey_summary_stats_by_group <- covid_survey_longer |>group_by(explanatory, explanatory_value, response) |>summarise(mean =mean(response_value, na.rm =TRUE),low =quantile(response_value, probs =0.10, na.rm =TRUE),high =quantile(response_value, probs =0.90, na.rm =TRUE),.groups ="drop" )print(covid_survey_summary_stats_by_group)
# A tibble: 6 × 6
response mean low high explanatory explanatory_value
<chr> <dbl> <dbl> <dbl> <chr> <fct>
1 resp_concern_s… 3.28 1 5 All ""
2 resp_confidenc… 1.43 1 2 All ""
3 resp_feel_safe… 1.36 1 2 All ""
4 resp_safety 2.03 1 5 All ""
5 resp_trust_info 1.40 1 2 All ""
6 resp_will_reco… 1.21 1 2 All ""
Code
#View(covid_survey_summary_stats_all)
Bind the two df’s create the df/tibble: covid_summary_of_stats
Code
# Get existing levels from grouped dataage_levels <-levels(covid_survey_summary_stats_by_group$explanatory_value)# Add a new level to represent the 'All' groupage_levels_with_all <-c(age_levels, "")# Create the all-summary with the new factor levelcovid_survey_summary_stats_all <- covid_survey_longer |>group_by(response) |>summarise(mean =mean(response_value, na.rm =TRUE),low =quantile(response_value, probs =0.10, na.rm =TRUE),high =quantile(response_value, probs =0.90, na.rm =TRUE),explanatory ="All",explanatory_value =factor("", levels = age_levels_with_all),.groups ="drop" )# Ensure grouped summary has the same levels toocovid_survey_summary_stats_by_group$explanatory_value <-factor( covid_survey_summary_stats_by_group$explanatory_value,levels = age_levels_with_all)# Bind them safely nowcovid_survey_summary_stats <-bind_rows( covid_survey_summary_stats_all, covid_survey_summary_stats_by_group)print(covid_survey_summary_stats)
# A tibble: 132 × 6
response mean low high explanatory explanatory_value
<chr> <dbl> <dbl> <dbl> <chr> <fct>
1 resp_concern_… 3.28 1 5 All ""
2 resp_confiden… 1.43 1 2 All ""
3 resp_feel_saf… 1.36 1 2 All ""
4 resp_safety 2.03 1 5 All ""
5 resp_trust_in… 1.40 1 2 All ""
6 resp_will_rec… 1.21 1 2 All ""
7 resp_concern_… 3.35 2 4.4 exp_age_bin "0"
8 resp_confiden… 1.65 1 2.4 exp_age_bin "0"
9 resp_feel_saf… 1.71 1 3.8 exp_age_bin "0"
10 resp_safety 1.41 1 2 exp_age_bin "0"
# ℹ 122 more rows
Q3e - recreate plot
Code
# Labels for rows (explanatory variables), including Gender and Raceexplanatory_labels <-c(All ="All",exp_age_bin ="Age",exp_gender ="Gender",exp_race ="Race" ,# Added Race labelexp_ethnicity ="Ethnicity")# - call formatting for encoded datacovid_age_only <-filter_age_data(covid_survey_summary_stats_by_group)covid_gender_only <-filter_gender_data(covid_survey_summary_stats_by_group)covid_race_only <-filter_race_data(covid_survey_summary_stats_by_group)covid_ethnicity_only <-filter_ethnicity_data(covid_survey_summary_stats_by_group)# Label mappings for responseresponse_labels <-c(resp_safety ="Vaccine is safe",resp_feel_safe_at_work ="Feel safer\n at work",resp_concern_safety ="Concern about \nvaccine safety",resp_confidence_science ="Confidence in \nscientific vetting",resp_trust_info ="Trust in \nvaccine info",resp_will_recommend ="Will recommend\nvaccine")# Reorder response factor levels to match response_labelscovid_age_only <- covid_age_only %>%mutate(response =factor(response, levels =names(response_labels)))covid_gender_only <- covid_gender_only %>%mutate(response =factor(response, levels =names(response_labels)))# View distinct codes used in the exp_ethnicity variablecovid_survey_summary_stats_by_group %>%filter(explanatory =="exp_ethnicity") %>%mutate(explanatory_value =as.character(explanatory_value)) %>%distinct(explanatory_value) %>%arrange(explanatory_value)
# A tibble: 2 × 1
explanatory_value
<chr>
1 1
2 2
Code
# Vector controlling heights of each row - add height for racerow_heights <-c(0.5, # - 'All' row height — adjust as needed3, # - 'exp_age_bin' row height3, # - 'exp_gender' row height - adjust as desired3, # - 'exp_race' row height - new Race row3# - ethnicity)# Reorder response factor levels for 'All' layercovid_all_only <- covid_survey_summary_stats_all %>%filter(is.finite(mean), is.finite(low), is.finite(high)) %>%mutate(response =factor(response, levels =names(response_labels)))# - vars for standardizing box size row/col# Define variables for strip appearancestrip_fill_color <-"gray90"strip_text_color <-"black"strip_text_size <-10strip_text_face <-"plain"strip_text_angle_x <-0strip_text_angle_y <-0strip_text_vjust_y <-0.5strip_placement <-"outside"# already used in your code# Call some functionsg0 <-plot_all(covid_all_only)# - second layer - Ageg1 <-plot_age(covid_age_only)# - third layer - genderg2 <-plot_gender(covid_gender_only)# Fourth layer - Raceg3 <-plot_race(covid_race_only)# Fifth layer: Ethnicity (if present)g4 <-plot_ethnicity(covid_ethnicity_only,"Error bars in range from 10th to 90th percentile")# Composite plot with 5 layers stacked (All / Age / Gender / Race / Ethnicity)composite_plot <- (g0 / g1 / g2 / g3 / g4 +plot_layout(heights = row_heights)) &theme(plot.margin =margin(0, 0, 0, 0))print(composite_plot)
4 - COVID survey - re-reconstruct
Q4 ….Make Plot from Q3, but use different end point quarantiles. When the error bars represent the 25th and 75th percentiles instead of the 10th and 90th, the intervals become narrower, reflecting a tighter range around the median of the data. This change reduces the apparent variability and uncertainty in responses. Compared to the previous plot, the shorter error bars may make the group differences appear more precise but potentially understate the true variability. Therefore, while the overall trends remain similar, conclusions about the degree of uncertainty should be adjusted to recognize that the interquartile range excludes more extreme values.
# A tibble: 132 × 6
response mean low high explanatory explanatory_value
<chr> <dbl> <dbl> <dbl> <chr> <fct>
1 resp_concern_… 3.28 2 4 All ""
2 resp_confiden… 1.43 1 2 All ""
3 resp_feel_saf… 1.36 1 1 All ""
4 resp_safety 2.03 1 3 All ""
5 resp_trust_in… 1.40 1 2 All ""
6 resp_will_rec… 1.21 1 1 All ""
7 resp_concern_… 3.35 2 4 exp_age_bin "0"
8 resp_confiden… 1.65 1 2 exp_age_bin "0"
9 resp_feel_saf… 1.71 1 2 exp_age_bin "0"
10 resp_safety 1.41 1 2 exp_age_bin "0"
# ℹ 122 more rows
5 - COVID survey - another view
Q5 …. COVID survey - another view. Create two bar charts of the Likert data for the six survey questions in from the plot in Exercise 2. This should be a single plot visualizing the percentages of each possible answer, with different questions on the y-axis. Use an appropriate color scale. Write alt text for your visualization as well.
The following object is masked from 'package:scales':
viridis_pal
Code
# Define response labelsresponse_labels <-c(resp_safety ="Vaccine is safe",resp_feel_safe_at_work ="Feel safer\n at work",resp_concern_safety ="Concern about \nvaccine safety",resp_confidence_science ="Confidence in \nscientific vetting",resp_trust_info ="Trust in \nvaccine info",resp_will_recommend ="Will recommend\nvaccine")# Step 1: Compute % response per question and response_valuelikert_summary <- covid_survey_longer %>%group_by(response, response_value) %>%summarise(count =n(), .groups ="drop") %>%group_by(response) %>%mutate(percent = count /sum(count) *100) %>%ungroup() %>%mutate(response =factor(response, levels =unique(response)), # preserve orderresponse_value =factor(response_value, levels =1:5) )# Center percentages for diverging bar chart (v1)likert_summary <- likert_summary %>%mutate(centered_percent =case_when( response_value <3~-percent, response_value ==3~0, response_value >3~ percent ) )
Warning: There were 2 warnings in `mutate()`.
The first warning was:
ℹ In argument: `centered_percent = case_when(...)`.
Caused by warning in `Ops.factor()`:
! '<' not meaningful for factors
ℹ Run `dplyr::last_dplyr_warnings()` to see the 1 remaining
warning.